{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1、获取数据\n",
    "# 2、合并表\n",
    "# 3、找到user_id和aisle之间的关系\n",
    "# 4、PCA降维"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1、获取数据\n",
    "order_products = pd.read_csv(\"./instacart/order_products__prior.csv\")\n",
    "products = pd.read_csv(\"./instacart/products.csv\")\n",
    "orders = pd.read_csv(\"./instacart/orders.csv\")\n",
    "aisles = pd.read_csv(\"./instacart/aisles.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 2、合并表\n",
    "# order_products__prior.csv：订单与商品信息\n",
    "\n",
    "# 字段：order_id, product_id, add_to_cart_order, reordered\n",
    "# products.csv：商品信息\n",
    "# 字段：product_id, product_name, aisle_id, department_id\n",
    "# orders.csv：用户的订单信息\n",
    "# 字段：order_id,user_id,eval_set,order_number,….\n",
    "# aisles.csv：商品所属具体物品类别\n",
    "# 字段： aisle_id, aisle\n",
    "\n",
    "# 合并aisles和products aisle和product_id\n",
    "tab1 = pd.merge(aisles, products, on=[\"aisle_id\", \"aisle_id\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "tab2 = pd.merge(tab1, order_products, on=[\"product_id\", \"product_id\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "tab3 = pd.merge(tab2, orders, on=[\"order_id\", \"order_id\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>aisle_id</th>\n",
       "      <th>aisle</th>\n",
       "      <th>product_id</th>\n",
       "      <th>product_name</th>\n",
       "      <th>department_id</th>\n",
       "      <th>order_id</th>\n",
       "      <th>add_to_cart_order</th>\n",
       "      <th>reordered</th>\n",
       "      <th>user_id</th>\n",
       "      <th>eval_set</th>\n",
       "      <th>order_number</th>\n",
       "      <th>order_dow</th>\n",
       "      <th>order_hour_of_day</th>\n",
       "      <th>days_since_prior_order</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>prepared soups salads</td>\n",
       "      <td>209</td>\n",
       "      <td>Italian Pasta Salad</td>\n",
       "      <td>20</td>\n",
       "      <td>94246</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>114082</td>\n",
       "      <td>prior</td>\n",
       "      <td>26</td>\n",
       "      <td>0</td>\n",
       "      <td>20</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>prepared soups salads</td>\n",
       "      <td>22853</td>\n",
       "      <td>Pesto Pasta Salad</td>\n",
       "      <td>20</td>\n",
       "      <td>94246</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>114082</td>\n",
       "      <td>prior</td>\n",
       "      <td>26</td>\n",
       "      <td>0</td>\n",
       "      <td>20</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4</td>\n",
       "      <td>instant foods</td>\n",
       "      <td>12087</td>\n",
       "      <td>Chicken Flavor Ramen Noodle Soup</td>\n",
       "      <td>9</td>\n",
       "      <td>94246</td>\n",
       "      <td>15</td>\n",
       "      <td>0</td>\n",
       "      <td>114082</td>\n",
       "      <td>prior</td>\n",
       "      <td>26</td>\n",
       "      <td>0</td>\n",
       "      <td>20</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>instant foods</td>\n",
       "      <td>47570</td>\n",
       "      <td>Original Flavor Macaroni &amp; Cheese Dinner</td>\n",
       "      <td>9</td>\n",
       "      <td>94246</td>\n",
       "      <td>14</td>\n",
       "      <td>1</td>\n",
       "      <td>114082</td>\n",
       "      <td>prior</td>\n",
       "      <td>26</td>\n",
       "      <td>0</td>\n",
       "      <td>20</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>13</td>\n",
       "      <td>prepared meals</td>\n",
       "      <td>10089</td>\n",
       "      <td>Dolmas</td>\n",
       "      <td>20</td>\n",
       "      <td>94246</td>\n",
       "      <td>25</td>\n",
       "      <td>0</td>\n",
       "      <td>114082</td>\n",
       "      <td>prior</td>\n",
       "      <td>26</td>\n",
       "      <td>0</td>\n",
       "      <td>20</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   aisle_id                  aisle  product_id  \\\n",
       "0         1  prepared soups salads         209   \n",
       "1         1  prepared soups salads       22853   \n",
       "2         4          instant foods       12087   \n",
       "3         4          instant foods       47570   \n",
       "4        13         prepared meals       10089   \n",
       "\n",
       "                               product_name  department_id  order_id  \\\n",
       "0                       Italian Pasta Salad             20     94246   \n",
       "1                         Pesto Pasta Salad             20     94246   \n",
       "2          Chicken Flavor Ramen Noodle Soup              9     94246   \n",
       "3  Original Flavor Macaroni & Cheese Dinner              9     94246   \n",
       "4                                    Dolmas             20     94246   \n",
       "\n",
       "   add_to_cart_order  reordered  user_id eval_set  order_number  order_dow  \\\n",
       "0                  5          0   114082    prior            26          0   \n",
       "1                  4          0   114082    prior            26          0   \n",
       "2                 15          0   114082    prior            26          0   \n",
       "3                 14          1   114082    prior            26          0   \n",
       "4                 25          0   114082    prior            26          0   \n",
       "\n",
       "   order_hour_of_day  days_since_prior_order  \n",
       "0                 20                     1.0  \n",
       "1                 20                     1.0  \n",
       "2                 20                     1.0  \n",
       "3                 20                     1.0  \n",
       "4                 20                     1.0  "
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tab3.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 3、找到user_id和aisle之间的关系\n",
    "table = pd.crosstab(tab3[\"user_id\"], tab3[\"aisle\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = table[:10000]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 4、PCA降维\n",
    "from sklearn.decomposition import PCA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1）实例化一个转换器类\n",
    "transfer = PCA(n_components=0.95)\n",
    "\n",
    "# 2）调用fit_transform\n",
    "data_new = transfer.fit_transform(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(10000, 42)"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_new.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
